home *** CD-ROM | disk | FTP | other *** search
- /*
- hth.c
- v 1.0.5
- 28 December 1993
-
- This simple program predicts whether a protein contains a helix-turn-
- helix motif, using the method of:
-
- Dodd, I. B., and J. B. Egan. 1990. Improved detection of helix-turn-
- helix DNA-binding motifs in protein sequences. Nucleic Acids Res.
- 18:5019-5026.
-
- This code written and donated to the public domain by:
- Conrad Halling
- Department of Molecular Genetics and Cell Biology
- University of Chicago
- 920 E 58th St
- Chicago IL 60637
-
- current address:
-
- Monsanto Co.
- 700 Chesterfield Parkway North
- St. Louis MO 63198
- e-mail: chhall@bb1t.monsanto.com
-
- How to compile this program:
-
- This program is written in ANSI C using THINK C 5.0.4.
- On the UNIX system at the University of Chicago (running SunOS Release 4.1.1),
- this code will not compile using cc but will compile using either gcc or acc.
- For example, to compile this program, you would type
- acc -o hth hth.c <return>
- This means, "start the acc (ANSI C Compiler) program, send
- the output to a file called "hth", and take the input
- from the file "hth.c".
- When the program is compiled, you run it by typing "hth"
- at the prompt.
-
- tabs = 4
-
- When using vi under UNIX, you can set the tabs to 4 by opening the
- file, typing escape (to go into command mode), colon (":") to go
- to the command line, and "set tabstop=4" (without the quotes).
-
- Format of input protein sequence:
-
- One protein sequence per file
- Single-letter code in upper case and/or lower case
- White space characters (space, tab, return, etc.) are ignored
- The program will abort if an invalid character is found
- */
-
- #include <ctype.h>
- #include <limits.h>
- #include <stddef.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
-
- #ifndef __HTH__
- #define __HTH__
-
- #ifndef TRUE
- #define TRUE 1
- #endif
-
- #ifndef FALSE
- #define FALSE 0
- #endif
-
- #define AMINO_ACIDS_COUNT 20
- #define MAX_SEQUENCE_LENGTH 20000
-
- #define WINDOW_SIZE 22 /* These values from Table 3 */
- #define NON_HTH_MEAN_SCORE 238.71 /* of Dodd and Egan (1991) */
- #define NON_HTH_STD_DEV 293.61
-
- /*
- Errors
- */
-
- #define NO_ERROR 0
- #define SEQUENCE_TOO_SHORT 1
- #define OUT_OF_MEMORY 2
- #define QUIT 3
- #define INVALID_CHAR 4
-
- /*
- Function prototypes
- */
-
- void DisplayError(
- short error );
- void DisplayResults(
- double convertedScore,
- size_t maxScorePosition,
- const char *sequence );
- short GetAminoAcid(
- char residue );
-
- #endif
-
-
- const short weightMatrix[ AMINO_ACIDS_COUNT ][ WINDOW_SIZE ] =
- {
- /* A (alanine) */
- -125, -194, -84, 70, 36, 54, 238, -15, 77, 26, -194,
- -194, -56, -84, 14, 77, -56, -56, -56, 46, -195, 36,
- /* C (cysteine) */
- -64, -64, -63, -63, -64, -64, -64, -64, -64, 47, 47,
- -63, -63, -64, -64, -64, -64, -64, -64, -63, -64, 47,
- /* D (aspartate) */
- -156, -154, -156, -154, 109, -156, -156, 109, -154, -156, 6,
- -156, -154, -85, -156, -156, -156, -156, -154, -154, -156, -85,
- /* E (glutamate) */
- -31, -9, -171, 70, 156, -171, -171, 107, 50, -60, -60,
- -171, -60, 78, 86, -171, -171, -101, -171, -170, 86, 9,
- /* F (phenylalanine) */
- 10, -130, 10, -130, -130, 10, -130, -129, -130, 102, -130,
- -130, -130, -130, -129, -130, -130, -129, -129, -129, 180, -130,
- /* G (glycine) */
- 30, 5, -190, -51, -191, -191, 18, -191, -191, -191, 202,
- -191, -10, -191, 5, -190, -191, -80, -190, -190, -191, -51,
- /* H (histidine ) */
- 62, 33, -76, -76, -7, -78, -78, 33, -7, -78, 84,
- -78, 33, 33, -78, -7, -78, -7, 62, 84, -78, -7,
- /* I (isoleucine ) */
- 75, -156, 101, -45, -86, 116, -156, -16, 65, -16, -156,
- 128, -156, -86, -156, -155, 188, -155, -16, 53, 122, -155,
- /* K (lysine ) */
- -31, -31, 10, 70, 79, -170, -170, 94, 70, -171, -9,
- -100, -100, 25, -100, -170, -171, -9, 38, -31, -9, 101,
- /* L (leucine) */
- 66, -212, 72, -213, -212, 144, -213, -102, 37, 132, -213,
- 97, -213, -142, -212, -212, 97, -212, -212, 37, 88, -213,
- /* M (methionine) */
- 122, -74, -3, -73, -73, -73, -74, -74, 88, 122, -73,
- 158, -74, -74, -3, -74, -3, -74, -74, -3, -74, -73,
- /* N (asparagine) */
- -137, 72, -137, -136, -137, -137, -137, -67, -136, -136, 128,
- -137, 72, -136, 2, -67, -137, 2, 84, -137, -137, 104,
- /* P (proline) */
- -156, 23, -157, -156, -157, -157, -157, -157, -157, -157, -157,
- -157, -46, 101, 39, -157, -157, -157, -157, -157, -157, -46,
- /* Q (glutamine) */
- -60, -130, 175, 90, 110, -131, -131, 90, 78, -131, -131,
- -60, -130, 154, 65, 119, -131, -20, 119, 31, -20, 90,
- /* R (arginine) */
- 65, 76, 110, 65, 7, -155, -154, 123, 76, -155, -154,
- -155, -154, 129, 54, 40, -155, 129, 179, -45, -155, 123,
- /* S (serine) */
- -118, 96, -188, 21, -48, -187, -8, -187, -118, -118, -77,
- -188, 174, -187, 135, -26, -188, 150, -77, -188, -187, -26,
- /* T (threonine) */
- 11, 149, -59, 80, -169, -8, -170, -99, -99, -30, -170,
- -8, 131, -30, -59, 198, -170, -30, -169, -170, -30, -59,
- /* V (valine) */
- 17, -67, -177, -177, -108, 100, -178, -178, -108, 71, -178,
- 160, -178, -16, -67, -67, 169, -178, 17, 31, 17, -178,
- /* W (tryptophan) */
- 44, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26,
- -26, -26, -25, -26, -25, -26, -26, 44, 279, -26, -26,
- /* Y (tyrosine) */
- -40, -110, 30, 1, -110, -109, -110, -110, -40, 30, -110,
- -109, -109, -40, -110, -40, -110, 162, 52, 86, -110, -110
- };
-
- const char aminoAcidsString[] = "ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy";
-
-
- int main( void )
- {
- char fileName[ FILENAME_MAX ],
- format[ 12 ],
- residue,
- *sequence;
- int resultsDisplayed = FALSE,
- theChar;
- short aminoAcid,
- fileNameEntered,
- maxScore,
- status,
- tempScore;
- size_t i,
- j,
- length,
- maxWindowPosition,
- maxScorePosition,
- position,
- sequenceLength;
- double convertedScore;
- FILE *sequenceFile;
-
- status = NO_ERROR;
-
- printf( "Welcome to hth, a program that predicts whether a protein contains\n" );
- printf( "a helix-turn-helix motif.\n\n" );
- printf( "For more information, please read\n\n" );
- printf( " Dodd, I. B., and J. B. Egan. 1990. Improved detection of\n" );
- printf( " helix-turn-helix DNA-binding motifs in protein sequences.\n" );
- printf( " Nucleic Acids Res. 18:5019-5026.\n\n" );
- printf( "Please be sure that your protein sequence is in \"plain\" format\n" );
- printf( "using the single-letter code.\n\n" );
-
- /*
- Allocate memory for the protein sequence.
- */
-
- sequence = ( char * ) malloc( MAX_SEQUENCE_LENGTH * sizeof( char ) );
- if ( NULL == sequence )
- status = OUT_OF_MEMORY;
-
- while ( NO_ERROR == status )
- {
-
- /*
- Get the name of the sequence file.
- */
-
- fileNameEntered = FALSE;
- while ( TRUE )
- {
- printf( "Name of protein file (q/Q = Quit): " );
- sprintf( format, "%%%us", FILENAME_MAX );
- scanf( format, fileName );
- length = strlen( fileName );
- if ( 0 != length )
- {
- /*
- Check for quit command.
- */
-
- if ( ( 1 == length ) &&
- ( ( fileName[ 0 ] == 'q' ) || ( fileName[ 0 ] == 'Q' ) ) )
- {
- status = QUIT;
- break;
- }
- else
- {
- /*
- Open the sequence file.
- */
-
- sequenceFile = fopen( fileName, "r" );
- if ( NULL == sequenceFile )
- printf( "File not found!\n\n" );
- else
- break;
- }
- }
- }
-
- if ( NO_ERROR == status )
- {
- /*
- Read the sequence from sequenceFile into sequence[].
- */
-
- i = 0;
- while ( i < MAX_SEQUENCE_LENGTH )
- {
- /*
- Stop reading at end of file.
- */
-
- theChar = getc( sequenceFile );
- if ( feof( sequenceFile ) )
- break;
-
- /*
- Skip white space characters.
- */
-
- if ( isspace( theChar ) )
- continue;
-
- /*
- GetAminoAcid will return AMINO_ACIDS_COUNT if the character
- is not found in AminoAcidsString[].
- */
-
- if ( AMINO_ACIDS_COUNT == GetAminoAcid( ( char ) theChar ) )
- {
- status = INVALID_CHAR;
- break;
- }
-
- /*
- The character is valid; add it to the string.
- */
-
- sequence[ i++ ] = ( char ) theChar;
- }
-
- if ( NO_ERROR == status )
- sequence[ i++ ] = 0x00;
-
- fclose( sequenceFile );
- }
-
- if ( NO_ERROR == status )
- {
- sequenceLength = strlen( sequence );
- if ( sequenceLength < WINDOW_SIZE )
- status = SEQUENCE_TOO_SHORT;
- }
-
- if ( NO_ERROR == status )
- {
- /*
- Calculate the highest score for the sequence.
- */
-
- resultsDisplayed = FALSE;
- maxScore = SHRT_MIN; /* defined in <limits.h> */
- maxScorePosition = 0;
- maxWindowPosition = sequenceLength - WINDOW_SIZE;
- for ( i = 0; i < maxWindowPosition; i++ )
- {
- tempScore = 0;
- for ( j = 0; j < WINDOW_SIZE; j++ )
- {
- position = i + j;
- residue = sequence[ position ];
- aminoAcid = GetAminoAcid( residue );
- tempScore += weightMatrix[ aminoAcid ] [ j ];
- }
- if ( tempScore > maxScore )
- {
- maxScore = tempScore;
- maxScorePosition = i;
- convertedScore =
- ( ( double ) maxScore - NON_HTH_MEAN_SCORE ) /
- ( NON_HTH_STD_DEV );
- if ( convertedScore >= 2.5 )
- {
- DisplayResults(
- convertedScore,
- maxScorePosition,
- sequence );
- resultsDisplayed = TRUE;
- maxScore = SHRT_MIN;
- }
- }
- }
- if ( !resultsDisplayed )
- DisplayResults( convertedScore, maxScorePosition, sequence );
- }
- if ( ( QUIT != status ) && ( NO_ERROR != status ) )
- {
- DisplayError( status );
- status = NO_ERROR;
- }
- }
- DisplayError( status );
- }
-
-
- void DisplayResults(
- double convertedScore,
- size_t maxScorePosition,
- const char *sequence )
- {
- char maxScoreString[ WINDOW_SIZE + 1 ];
- short i,
- percentage;
- size_t position;
-
- printf(
- "The score is %0.2f at position %lu.\n",
- convertedScore,
- maxScorePosition + 1 );
-
- for ( i = 0; i < WINDOW_SIZE; i++ )
- {
- position = maxScorePosition + i;
- maxScoreString[ i ] = sequence[ position ];
- }
- maxScoreString[ i ] = 0x00;
-
- printf(
- "The sequence at this position is %s.\n",
- maxScoreString );
-
- if ( convertedScore >= 4.5 )
- percentage = 100;
- else if ( convertedScore >= 4.0 )
- percentage = 90;
- else if ( convertedScore >= 3.5 )
- percentage = 71;
- else if ( convertedScore >= 3.0 )
- percentage = 50;
- else if ( convertedScore >= 2.5 )
- percentage = 25;
-
- if ( convertedScore < 2.5 )
- printf( "This score is not significant.\n\n" );
- else
- {
- printf(
- "This score suggests an approximately %hd%% probability that ",
- percentage );
- printf( "this protein\ncontains a helix-turn-helix motif.\n\n" );
- }
- }
-
-
- short GetAminoAcid(
- char residue )
- {
- short i,
- limit;
-
- limit = strlen( aminoAcidsString );
- for ( i = 0; i < limit; i++ )
- {
- if ( residue == aminoAcidsString[ i ] )
- break;
- }
- if ( i >= AMINO_ACIDS_COUNT )
- i -= AMINO_ACIDS_COUNT;
-
- return ( i );
- }
-
-
- void DisplayError(
- short error )
- {
-
- char errorString[5][80] =
- {
- "\n\n",
- "The protein sequence is too short to analyze.\n\n",
- "There is insufficient memory to continue.\n\n",
- "Good-bye!\n\n",
- "There is an invalid character in the protein sequence.\n\n"
- };
-
- printf( errorString[ error ] );
- }
-